{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip3 install git+https://github.com/huseinzol05/malaya.git@4.6.1 --no-deps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'mesolitica-tpu.json'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import malaya\n",
    "from malaya.preprocessing import Tokenizer\n",
    "from malaya.text.function import case_of\n",
    "from malaya.augmentation import (\n",
    "    replace_similar_consonants, \n",
    "    replace_similar_vowels, \n",
    "    socialmedia_form,\n",
    "    vowel_alternate)\n",
    "from malaya.text import rules\n",
    "from collections import defaultdict\n",
    "import random\n",
    "import re\n",
    "import tensorflow as tf\n",
    "from malaya.text.tatabahasa import alphabet, consonants, vowels\n",
    "from malaya.text.function import augmentation_textcleaning, simple_textcleaning\n",
    "\n",
    "def cleaning_row(string):\n",
    "    string = string.replace('\\n', ' ').replace('\\t', ' ')\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "    return string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "replace_normalizer = defaultdict(list)\n",
    "for k, v in rules.rules_normalizer.items():\n",
    "    if v.count(' ') == 0:\n",
    "        replace_normalizer[v].append(k)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['jugak', 'juge']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def socialmedia_form(word: str):\n",
    "    \"\"\"\n",
    "    augmenting a word into socialmedia form.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    word: str\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    result: List[str]\n",
    "    \"\"\"\n",
    "\n",
    "    word = simple_textcleaning(word)\n",
    "    if not len(word):\n",
    "        raise ValueError('word is too short to augment shortform.')\n",
    "\n",
    "    results = []\n",
    "\n",
    "    if len(word) > 1:\n",
    "\n",
    "        if word[-1] == 'a' and word[-2] in consonants:\n",
    "            results.append(word[:-1] + 'e')\n",
    "\n",
    "        if word[0] == 'f' and word[-1] == 'r':\n",
    "            results.append('p' + word[1:])\n",
    "    \n",
    "        if word[-2] in consonants and word[-1] in vowels:\n",
    "            results.append(word + 'k')\n",
    "\n",
    "        if word[-2] in vowels and word[-1] == 'h':\n",
    "            results.append(word[:-1])\n",
    "\n",
    "    if len(word) > 2:\n",
    "        if word[-3] in consonants and word[-2:] == 'ar':\n",
    "            results.append(word[:-2] + 'o')\n",
    "\n",
    "        if word[0] == 'h' and word[1] in vowels and word[2] in consonants:\n",
    "            results.append(word[1:])\n",
    "\n",
    "        if word[-3] in consonants and word[-2:] == 'ng':\n",
    "            results.append(word[:-2] + 'g')\n",
    "\n",
    "        if word[1:3] == 'ng':\n",
    "            results.append(word[:1] + x[2:])\n",
    "\n",
    "    return list(set(results))\n",
    "\n",
    "socialmedia_form('juga')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Husein makan ayam di kampung Jawa'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def random_slide(string, min_n = 2):\n",
    "    splitted = string.split()\n",
    "    n = random.randint(min_n, len(splitted))\n",
    "    i = random.randint(0, len(splitted) - n)\n",
    "    return ' '.join(splitted[i: i + n])\n",
    "\n",
    "random_slide('Husein makan ayam di kampung Jawa juga')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['word']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'word'.split('-')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'abad kek-14-14-14-14'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer = Tokenizer(duration = False, date = False).tokenize\n",
    "\n",
    "def augment(string):\n",
    "    \n",
    "    r = []\n",
    "    for word in tokenizer(string):\n",
    "        original_word = word\n",
    "        word_lower = word.lower()\n",
    "        try:\n",
    "            if word.istitle() or word.isupper():\n",
    "                if random.random() >= 0.3:\n",
    "                    word = case_of(word)(random.choice(replace_normalizer[word_lower]))\n",
    "            else:\n",
    "                splitted = word_lower.split('-')\n",
    "                if len(splitted) > 1:\n",
    "                    word = splitted[0]\n",
    "                    after = '-'.join(splitted[1:])\n",
    "                else:\n",
    "                    after = ''\n",
    "                s = socialmedia_form(word_lower)\n",
    "                if len(s):\n",
    "                    word = case_of(word)(random.choice(s))\n",
    "                else:\n",
    "                    if word_lower in replace_normalizer and random.random() >= 0.3:\n",
    "                        word = case_of(word)(random.choice(replace_normalizer[word_lower]))\n",
    "\n",
    "                word = case_of(word)(vowel_alternate(word, 0.7))\n",
    "                word = case_of(word)(replace_similar_consonants(word, 0.95))\n",
    "                word = case_of(word)(replace_similar_vowels(word, 0.8))\n",
    "            \n",
    "                if len(after):\n",
    "                    word = f'{word}-{after}'\n",
    "                \n",
    "        except Exception as e:\n",
    "            word = original_word\n",
    "            pass\n",
    "        \n",
    "        r.append(word)\n",
    "    return ' '.join(r)\n",
    "\n",
    "augment('abad ke-14-14-14-14')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Husein makn ayam dik kg Jawa juge .'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = \"\"\"\n",
    "Husein makan ayam di kampung Jawa juga\n",
    "\"\"\"\n",
    "splitted = malaya.text.function.split_into_sentences(string)\n",
    "augment(splitted[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Husein makan ayam dk kg Jawa .'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = \"\"\"\n",
    "Husein makan ayam di kampung Jawa\n",
    "\"\"\"\n",
    "splitted = malaya.text.function.split_into_sentences(string)\n",
    "augment(splitted[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Husein makan ayam di kampung Jawa.'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "splitted[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = ['/home/husein/pure-text/filtered-dumping-wiki.txt',\n",
    "        '/home/husein/pure-text/dumping-cleaned-news.txt',]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2037249"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open(files[0]) as fopen:\n",
    "    data = list(filter(None, fopen.read().split('\\n')))\n",
    "    \n",
    "data = [i for i in data if len(i) >= 2]\n",
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "fast_text = malaya.language_detection.fasttext()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['other']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fast_text.predict(['តើប្រព័ន្ធប្រតិបត្តិការណាដែលត្រូវគ្នាជាមួយកម្មវិធីធនាគារអេប៊ីអេ។'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "def loop(strings):\n",
    "    results = []\n",
    "    for i in tqdm(range(len(strings))):\n",
    "        try:\n",
    "            if fast_text.predict([strings[i]])[0] == 'other':\n",
    "                continue\n",
    "            if random.random() > 0.6:\n",
    "                s = random_slide(strings[i])\n",
    "                if not len(s):\n",
    "                    s = strings[i]\n",
    "            else:\n",
    "                s = strings[i]\n",
    "            t = ' '.join(tokenizer(s))\n",
    "            if random.random() >= 0.2:\n",
    "                row = augment(s)\n",
    "                results.append((row, t))\n",
    "            else:\n",
    "                results.append((t, t))\n",
    "        except:\n",
    "            pass\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['serbak', 'serbe']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "socialmedia_form('serba')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10/10 [00:00<00:00, 1856.54it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('Dirk Jan Klaas \" Klaas-Jan \" Huntelaar ( lahir 12 Ogos 1983 ) merupakan pemain bolak sepk Belanda yg nermain',\n",
       "  'Dirk Jan Klaas \" Klaas-Jan \" Huntelaar ( lahir 12 Ogos 1983 ) merupakan pemain bola sepak Belanda yang bermain'),\n",
       " ('Beliau kinok bermain tuk klab Ajax .',\n",
       "  'Beliau kini bermain untuk kelab Ajax .'),\n",
       " ('Hypo-Arena .', 'Hypo-Arena .'),\n",
       " ('dikenali sebagai ) ialah sebuah', 'dikenali sebagai ) ialah sebuah'),\n",
       " ('Ia merupakn stadiim team Austria Karnten .',\n",
       "  'Ia merupakan stadium pasukan Austria Karnten .'),\n",
       " ('Stadium lama dikenali sebagai Wortherseestadion , dibina pada 1960 dan mempunyai kapasiti sebanyak 10,900 .',\n",
       "  'Stadium lama dikenali sebagai Wortherseestadion , dibina pada 1960 dan mempunyai kapasiti sebanyak 10,900 .'),\n",
       " ('Ia dirobohkan pada 2005 dan digantikan dengan Hypo-Arena yang baru , juga dikenali sehingga 30 Jun 2007 dengan nama \" Wortherseestadion \" .',\n",
       "  'Ia dirobohkan pada 2005 dan digantikan dengan Hypo-Arena yang baru , juga dikenali sehingga 30 Jun 2007 dengan nama \" Wortherseestadion \" .'),\n",
       " ('Ia adla sala satk dripadk 8 stdium utk UEFA Euro 2008 , dan dobne utk menamping 32,000 penonton .',\n",
       "  'Ia adalah salah satu daripada 8 stadium untuk UEFA Euro 2008 , dan dibina untuk menampung 32,000 penonton .'),\n",
       " ('acre tersebiy , kapasitk stadium inok sdg diprtimbnfkn utk dikurngkm kepadak',\n",
       "  'acara tersebut , kapasiti stadium ini sedang dipertimbangkan untuk dikurangkan kepada'),\n",
       " ('prshabatan dik amtare Austria', 'persahabatan di antara Austria')]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loop(data[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 127328/127328 [01:14<00:00, 1702.91it/s]\n",
      "100%|██████████| 1/1 [00:00<00:00, 1219.63it/s]60.03it/s]\n",
      "100%|██████████| 127328/127328 [01:16<00:00, 1669.30it/s]\n",
      "100%|██████████| 127328/127328 [01:16<00:00, 1667.24it/s]\n",
      "100%|██████████| 127328/127328 [01:17<00:00, 1653.37it/s]\n",
      "100%|██████████| 127328/127328 [01:16<00:00, 1653.89it/s]\n",
      "100%|██████████| 127328/127328 [01:17<00:00, 1650.10it/s]\n",
      "100%|██████████| 127328/127328 [01:21<00:00, 1565.66it/s]\n",
      "100%|██████████| 127328/127328 [01:22<00:00, 1544.89it/s]\n",
      "100%|██████████| 127328/127328 [01:22<00:00, 1536.30it/s]\n",
      "100%|██████████| 127328/127328 [01:23<00:00, 1524.38it/s]\n",
      "100%|██████████| 127328/127328 [01:23<00:00, 1527.30it/s]\n",
      "100%|██████████| 127328/127328 [01:23<00:00, 1522.07it/s]\n",
      "100%|██████████| 127328/127328 [01:23<00:00, 1518.42it/s]\n",
      "100%|██████████| 127328/127328 [01:24<00:00, 1502.69it/s]\n",
      "100%|██████████| 127328/127328 [01:24<00:00, 1501.38it/s]\n",
      "100%|██████████| 127328/127328 [01:26<00:00, 1471.03it/s]\n"
     ]
    }
   ],
   "source": [
    "import cleaning\n",
    "\n",
    "results1 = cleaning.multiprocessing(data, loop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 2033099/2033099 [00:00<00:00, 2255652.40it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.7365666895709456"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "not_same = 0\n",
    "for r in tqdm(results1):\n",
    "    if r[0] != r[1]:\n",
    "        not_same += 1\n",
    "\n",
    "not_same / len(results1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 2033099/2033099 [00:34<00:00, 58728.25it/s]\n"
     ]
    }
   ],
   "source": [
    "with tf.io.gfile.GFile('spelling-correction-wiki.tsv', \"w\") as outfile:\n",
    "    for i in tqdm(range(len(results1))):\n",
    "        l = cleaning_row(results1[i][0])\n",
    "        r = cleaning_row(results1[i][1])\n",
    "        outfile.write(\"%s\\t%s\\n\" % (l, r))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3483869"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open(files[1]) as fopen:\n",
    "    data = list(filter(None, fopen.read().split('\\n')))\n",
    "    \n",
    "data = [i for i in data if len(i) >= 2]\n",
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 217741/217741 [02:26<00:00, 1488.10it/s]\n",
      "100%|██████████| 13/13 [00:00<00:00, 1667.41it/s].34it/s]\n",
      "100%|██████████| 217741/217741 [02:27<00:00, 1479.33it/s]\n",
      "100%|██████████| 217741/217741 [02:26<00:00, 1487.89it/s]\n",
      "100%|██████████| 217741/217741 [02:26<00:00, 1485.05it/s]\n",
      "100%|██████████| 217741/217741 [02:28<00:00, 1462.91it/s]\n",
      "100%|██████████| 217741/217741 [02:29<00:00, 1461.21it/s]\n",
      "100%|██████████| 217741/217741 [02:28<00:00, 1462.65it/s]\n",
      "100%|██████████| 217741/217741 [02:30<00:00, 1445.70it/s]\n",
      "100%|██████████| 217741/217741 [02:30<00:00, 1451.15it/s]\n",
      "100%|██████████| 217741/217741 [02:30<00:00, 1451.52it/s]\n",
      "100%|██████████| 217741/217741 [02:30<00:00, 1447.91it/s]\n",
      "100%|██████████| 217741/217741 [02:31<00:00, 1440.86it/s]\n",
      "100%|██████████| 217741/217741 [02:30<00:00, 1444.52it/s]\n",
      "100%|██████████| 217741/217741 [02:30<00:00, 1448.46it/s]\n",
      "100%|██████████| 217741/217741 [02:31<00:00, 1434.80it/s]\n",
      "100%|██████████| 217741/217741 [02:32<00:00, 1429.16it/s]\n"
     ]
    }
   ],
   "source": [
    "results1 = cleaning.multiprocessing(data, loop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 3414470/3414470 [00:01<00:00, 2221148.74it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.7739716559231741"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "not_same = 0\n",
    "for r in tqdm(results1):\n",
    "    if r[0] != r[1]:\n",
    "        not_same += 1\n",
    "\n",
    "not_same / len(results1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 3414470/3414470 [01:05<00:00, 51997.72it/s]\n"
     ]
    }
   ],
   "source": [
    "with tf.io.gfile.GFile('spelling-correction-news.tsv', \"w\") as outfile:\n",
    "    for i in tqdm(range(len(results1))):\n",
    "        l = cleaning_row(results1[i][0])\n",
    "        r = cleaning_row(results1[i][1])\n",
    "        outfile.write(\"%s\\t%s\\n\" % (l, r))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "from google.cloud import storage\n",
    "\n",
    "client = storage.Client()\n",
    "bucket = client.bucket('mesolitica-tpu-general')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "blob = bucket.blob('t5-data-v2/spelling-correction-wiki.tsv')\n",
    "blob.upload_from_filename('spelling-correction-wiki.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "blob = bucket.blob('t5-data-v2/spelling-correction-news.tsv')\n",
    "blob.upload_from_filename('spelling-correction-news.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "import tensorflow_datasets as tfds\n",
    "from t5.data import preprocessors as prep\n",
    "import functools\n",
    "import t5\n",
    "import gin\n",
    "import sentencepiece as spm\n",
    "from glob import glob\n",
    "import os\n",
    "\n",
    "gin.parse_config_file('pretrained_models_base_operative_config.gin')\n",
    "vocab = 'sp10m.cased.ms-en.model'\n",
    "sp = spm.SentencePieceProcessor()\n",
    "sp.Load(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def spelling_dataset(split, shuffle_files = False):\n",
    "    del shuffle_files\n",
    "    ds = tf.data.TextLineDataset(\n",
    "        [\n",
    "            'spelling-correction-wiki.tsv'\n",
    "        ]\n",
    "    )\n",
    "\n",
    "    ds = ds.map(\n",
    "        functools.partial(\n",
    "            tf.io.decode_csv,\n",
    "            record_defaults = ['', ''],\n",
    "            field_delim = '\\t',\n",
    "            use_quote_delim = False,\n",
    "        ),\n",
    "        num_parallel_calls = tf.data.experimental.AUTOTUNE,\n",
    "    )\n",
    "    ds = ds.map(lambda *ex: dict(zip(['question', 'answer'], ex)))\n",
    "    return ds\n",
    "\n",
    "def spelling_preprocessor(ds):\n",
    "    def to_inputs_and_targets(ex):\n",
    "        return {\n",
    "            'inputs': tf.strings.join(['ejaan: ', ex['question']]),\n",
    "            'targets': ex['answer'],\n",
    "        }\n",
    "\n",
    "    return ds.map(\n",
    "        to_inputs_and_targets,\n",
    "        num_parallel_calls = tf.data.experimental.AUTOTUNE,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "t5.data.TaskRegistry.remove('spelling_dataset')\n",
    "t5.data.TaskRegistry.add(\n",
    "    'spelling_dataset',\n",
    "    dataset_fn = spelling_dataset,\n",
    "    splits = ['train'],\n",
    "    text_preprocessor = [spelling_preprocessor],\n",
    "    sentencepiece_model_path = vocab,\n",
    "    metric_fns = [t5.evaluation.metrics.accuracy],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "nq_task = t5.data.TaskRegistry.get(\"spelling_dataset\")\n",
    "ds = nq_task.get_dataset(split='knowledge-graph.tsv', sequence_length={\"inputs\": 256, \"targets\": 256})\n",
    "r = tfds.as_numpy(ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'inputs_plaintext': b'ejaan: \" Nota : \" Perkataan deep krungan hndakla dnyanyion ole kuit , yng lainnye dnyanyokan secark slok .',\n",
       " 'inputs': array([28699,    31,    13,     6, 21387,    13,    31,    13,     6,\n",
       "        22653,  4081,  8092,  9756,    13,  7999,  7131,   472,    13,\n",
       "           79,    38,  2721,  1186,    13,  9134,    13,   848,   545,\n",
       "           13,    14,    13,   128,  2460,   116,  2721,    81,    13,\n",
       "           79,    38,  2721,   162,   103,    13, 13384, 11680,    13,\n",
       "           16, 12330,    13,     3,     1]),\n",
       " 'targets_plaintext': b'\" Nota : \" Perkataan dalam kurungan hendaklah dinyanyikan oleh koir , yang lainnya dinyanyikan secara solo .',\n",
       " 'targets': array([   13,     6, 21387,    13,    31,    13,     6, 22653,    36,\n",
       "        25840,  9339, 28420,    60,    13,  1232,  1022,    13,    14,\n",
       "           17, 11301, 28420,   156,  7186,    13,     3,     1])}"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "next(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
